Data Refrsh

Load libraries

library(modeltime)
library(dplyr)
library(EIAapi)
library(jsonlite)
library(gt)
library(plotly)
library(lubridate)
library(modeltime)
source("../pipeline/eia_data.R")
source("../pipeline/backtesting.R")

API Settings:

meta_json <- read_json(path = "../settings/settings.json")
s <- meta_json$series
series <- lapply(1:length(s), function(i) {
    return(data.frame(
        parent_id = s[[i]]$parent_id,
        parent_name = s[[i]]$parent_name,
        subba_id = s[[i]]$subba_id,
        subba_name = s[[i]]$subba_name
    ))
}) |>
    bind_rows()


facets_template <- list(
    parent = NULL,
    subba = NULL
)

eia_api_key <- Sys.getenv("EIA_API_KEY")

api_path <- meta_json$api_path
meta_path <- meta_json$meta_path
data_path <- meta_json$data_path
forecast_path <- meta_json$forecast_path
forecast_log_path <- meta_json$forecast_log_path
calibrated_models_path <- meta_json$calibrated_models_path

h <- meta_json$backtesting$h
lags <- meta_json$backtesting$features$lags |> unlist()
train_length <- meta_json$train_length
offset <- meta_json$offset
tz <- meta_json$timezone
meta_obj <- get_metadata(api_key = eia_api_key, api_path = api_path, meta_path = meta_path, series = series)

gt(meta_obj$request_meta)
parent subba end_act request_start end updates_available
CISO PGAE 2024-07-11 07:00:00 2024-07-11 08:00:00 2024-07-11 07:00:00 FALSE
CISO SCE 2024-07-11 07:00:00 2024-07-11 08:00:00 2024-07-11 07:00:00 FALSE
CISO SDGE 2024-07-11 07:00:00 2024-07-11 08:00:00 2024-07-11 07:00:00 FALSE
CISO VEA 2024-07-11 07:00:00 2024-07-11 08:00:00 2024-07-11 07:00:00 FALSE
m <- meta_obj$request_meta
index <- meta_obj$last_index + 1

data <- NULL
meta_new <- NULL

for (i in 1:nrow(m)) {
    facets <- facets_template
    facets$parent <- m$parent[i]
    facets$subba <- m$subba[i]
    start <- m$request_start[i]
    end <- m$end[i]
    print(paste(facets$parent, facets$subba, sep = " - "))

    if (m$updates_available[i]) {
        temp <- eia_backfill(
            start = start - lubridate::hours(24),
            end = end + lubridate::hours(24),
            offset = offset,
            api_key = eia_api_key,
            api_path = paste(api_path, "data", sep = ""),
            facets = facets
        ) |> dplyr::filter(time >= start & time <= end)

        index <- seq.POSIXt(from = start, to = end, by = "hour")
        ts_obj <- data.frame(period = index) |>
            left_join(temp, by = c("period" = "time"))
    } else {
        ts_obj <- NULL
        print("No new data is available")
    }

    meta_temp <- create_metadata(data = ts_obj, start = start, end = end, type = "refresh")

    if (is.null(ts_obj)) {
        meta_temp$parent <- m$parent[i]
        meta_temp$subba <- m$subba[i]
    }

    if (meta_temp$success) {
        print("Append the new data")
        d <- append_data(data_path = data_path, new_data = ts_obj, save = TRUE)
        meta_temp$update <- TRUE
    } else {
        meta_temp$update <- FALSE
        meta_temp$comments <- paste(meta_temp$comments, "The data refresh failed, please check the log; ", sep = "")
    }
    meta_temp$index <- NA
    meta_df <- as.data.frame(meta_temp)
    if (!is.null(ts_obj)) {
        data <- bind_rows(data, ts_obj)
    }
    meta_new <- bind_rows(meta_new, meta_df)
}
[1] "CISO - PGAE"
[1] "No new data is available"
[1] "CISO - SCE"
[1] "No new data is available"
[1] "CISO - SDGE"
[1] "No new data is available"
[1] "CISO - VEA"
[1] "No new data is available"
gt(meta_new)
index parent subba time start end start_act end_act start_match end_match n_obs na type update success comments
NA CISO PGAE 2024-07-12 04:41:28.361843 2024-07-11 08:00:00 2024-07-11 07:00:00 NA NA NA NA NA NA refresh FALSE FALSE No new data is available; The data refresh failed, please check the log;
NA CISO SCE 2024-07-12 04:41:28.363465 2024-07-11 08:00:00 2024-07-11 07:00:00 NA NA NA NA NA NA refresh FALSE FALSE No new data is available; The data refresh failed, please check the log;
NA CISO SDGE 2024-07-12 04:41:28.364919 2024-07-11 08:00:00 2024-07-11 07:00:00 NA NA NA NA NA NA refresh FALSE FALSE No new data is available; The data refresh failed, please check the log;
NA CISO VEA 2024-07-12 04:41:28.366278 2024-07-11 08:00:00 2024-07-11 07:00:00 NA NA NA NA NA NA refresh FALSE FALSE No new data is available; The data refresh failed, please check the log;
meta_updated <- append_metadata(meta_path = meta_path, new_meta = meta_new, save = TRUE, init = FALSE)
[1] "Saving the metadata file"

Plot the Series

We will use Plotly to visualize the series:

if (!is.null(data)) {
    d <- data |> arrange(subba, period)

    p <- plot_ly(d, x = ~period, y = ~value, color = ~subba, type = "scatter", mode = "lines")

    p
} else {
    print("No new data is available")
}
[1] "No new data is available"
data <- readr::read_csv(file = data_path, col_types = readr::cols(
    period = readr::col_datetime(format = ""),
    subba = readr::col_character(),
    subba_name = readr::col_character(),
    parent = readr::col_character(),
    parent_name = readr::col_character(),
    value = readr::col_double(),
    value_units = readr::col_character()
))

head(data)
# A tibble: 6 x 8
  period              subba subba_name      parent parent_name value value_units
  <dttm>              <chr> <chr>           <chr>  <chr>       <dbl> <chr>      
1 2018-07-01 08:00:00 PGAE  Pacific Gas an~ CISO   California~ 12522 megawattho~
2 2018-07-01 09:00:00 PGAE  Pacific Gas an~ CISO   California~ 11745 megawattho~
3 2018-07-01 10:00:00 PGAE  Pacific Gas an~ CISO   California~ 11200 megawattho~
4 2018-07-01 11:00:00 PGAE  Pacific Gas an~ CISO   California~ 10822 megawattho~
5 2018-07-01 12:00:00 PGAE  Pacific Gas an~ CISO   California~ 10644 megawattho~
6 2018-07-01 13:00:00 PGAE  Pacific Gas an~ CISO   California~ 10559 megawattho~
# i 1 more variable: type <chr>
p <- plot_ly(data, x = ~period, y = ~value, color = ~subba, type = "scatter", mode = "lines")

p

Refresh the forecast

fc <- NULL
input <- data
index <- "period"
var <- "value"
train_length <- 24 * 31 * 25
lags <- lags
init <- FALSE
save <- FALSE
seasonal <- TRUE
trend <- TRUE


input <- input |>
    dplyr::select(subba, !!rlang::sym(index), y = !!rlang::sym(var))

input_last_point <- input |>
    dplyr::group_by(subba) |>
    dplyr::filter(!!rlang::sym(index) == max(!!rlang::sym(index))) |>
    dplyr::ungroup() |>
    dplyr::select(subba, last_time = !!rlang::sym(index))




log <- load_forecast_log(forecast_log_path = forecast_log_path)

calibrated_models <- readRDS(calibrated_models_path) |>
    dplyr::left_join(
        log |>
            dplyr::filter(success) |>
            dplyr::group_by(subba) |>
            dplyr::filter(end == max(end)) |>
            dplyr::select(subba, method, end),
        by = c("subba", "method")
    ) |>
    dplyr::left_join(input_last_point, by = "subba") |>
    dplyr::mutate(end_filter = lubridate::floor_date(last_time, unit = "day") - lubridate::hours(1)) |>
    dplyr::mutate(refresh = ifelse(end_filter < last_time, TRUE, FALSE))

head(log)
  index subba  model  method                time forecast_label      start
1     1  PGAE     LM  model6 2024-07-12 02:45:19     2024-07-08 2024-07-08
2     2   SCE GLMNET model18 2024-07-12 02:45:19     2024-07-08 2024-07-08
3     3  SDGE     LM  model7 2024-07-12 02:45:19     2024-07-08 2024-07-08
4     4   VEA     LM  model5 2024-07-12 02:45:19     2024-07-08 2024-07-08
                  end  h n_obs n_obs_flag na_flag success score       mape
1 2024-07-08 23:00:00 24    24       TRUE   FALSE    TRUE  TRUE 0.04733980
2 2024-07-08 23:00:00 24    24       TRUE   FALSE    TRUE  TRUE 0.04590319
3 2024-07-08 23:00:00 24    24       TRUE   FALSE    TRUE  TRUE 0.07076818
4 2024-07-08 23:00:00 24    24       TRUE   FALSE    TRUE  TRUE 0.05896550
       rmse  coverage
1 773.92239 1.0000000
2 900.23786 1.0000000
3 184.66469 0.8333333
4  15.02373 0.7916667
head(calibrated_models)
# Modeltime Table
# A tibble: 4 x 12
  .model_id .model    .model_desc .type .calibration_data method partition subba
      <int> <named l> <chr>       <chr> <list>            <chr>      <int> <chr>
1         6 <fit[+]>  LM          Test  <tibble [24 x 4]> model6        20 PGAE 
2        17 <fit[+]>  GLMNET      Test  <tibble [24 x 4]> model~        20 SCE  
3         7 <fit[+]>  LM          Test  <tibble [24 x 4]> model7        20 SDGE 
4         5 <fit[+]>  LM          Test  <tibble [24 x 4]> model5        20 VEA  
# i 4 more variables: end <dttm>, last_time <dttm>, end_filter <dttm>,
#   refresh <lgl>
if (!any(calibrated_models$refresh)) {
    message("No new data is available to refresh the forecast")
} else {
    message("New data is avaiable, starting the forecast refresh process")
    calibrated_models <- calibrated_models |> dplyr::filter(refresh == TRUE)

    for (i in 1:nrow(calibrated_models)) {
        end <- calibrated_models$end_filter[i]
        start <- end - lubridate::hours(train_length)

        temp <- input |>
            dplyr::filter(
                subba == calibrated_models$subba[i],
                !!rlang::sym(index) >= start & !!rlang::sym(index) <= end
            )
        if (i == 1) {
            d <- temp
        } else {
            d <- rbind(d, temp)
        }
    }

    head(d)

    input <- d
    selected_models <- calibrated_models
    h <- h
    index <- index
    var <- "y"
    lags <- lags
    seasonal <- TRUE
    trend <- TRUE

    subba_list <- unique(input$subba)

    fc <- lapply(seq_along(subba_list), function(i) {
        m <- selected_models |> dplyr::filter(subba == subba_list[i])
        method <- m$method[1]

        df <- input |>
            dplyr::filter(subba == subba_list[i])

        print(head(df))
        input <- df
        calibrated_models <- m
        p <- max(calibrated_models$partition)
        m <- method
        new_data <- create_future_frame(input = input, index = index, h = h)

        input$y <- input[[var]]

        if (!is.null(lags)) {
            input <- input |> add_lags(index = index, var = var, lags = lags)
            new_data <- add_lags_forecast(
                input = input,
                forecast = new_data,
                index = index,
                var = var,
                lags = lags
            )
        }

        if (seasonal) {
            input <- add_seasonal(input = input, index = index)
            new_data <- add_seasonal(input = new_data, index = index)
        }

        if (trend) {
            input <- add_trend(input = input, index = index)
            new_data <- add_trend(input = new_data, index = index)
        }

        print(input)

        # fc_subba <- create_forecast(
        #     input = df,
        #     calibrated_models = m,
        #     method = method,
        #     h = h,
        #     index = index,
        #     var = var,
        #     lags = lags,
        #     seasonal = seasonal,
        #     trend = trend
        # ) |>
        #     dplyr::select(time = .index, subba, method, model = .model_desc, yhat = .value, lower = .conf_lo, upper = .conf_hi)

        # fc_subba$forecast_label <- as.character(as.Date(min(fc_subba$time)))
        return(input)
    }) |> dplyr::bind_rows()

    print(head(fc))

    # head(forecast)
}
New data is avaiable, starting the forecast refresh process
# A tibble: 6 x 3
  subba period                  y
  <chr> <dttm>              <dbl>
1 PGAE  2022-05-27 23:00:00 11545
2 PGAE  2022-05-28 00:00:00 12061
3 PGAE  2022-05-28 01:00:00 12347
4 PGAE  2022-05-28 02:00:00 12697
5 PGAE  2022-05-28 03:00:00 12897
6 PGAE  2022-05-28 04:00:00 13140
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
# A tibble: 18,601 x 15
   subba period                  y lag_24 lag_25 lag_26 lag_27 lag_28 lag_48
   <chr> <dttm>              <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 PGAE  2022-05-27 23:00:00 11545     NA     NA     NA     NA     NA     NA
 2 PGAE  2022-05-28 00:00:00 12061     NA     NA     NA     NA     NA     NA
 3 PGAE  2022-05-28 01:00:00 12347     NA     NA     NA     NA     NA     NA
 4 PGAE  2022-05-28 02:00:00 12697     NA     NA     NA     NA     NA     NA
 5 PGAE  2022-05-28 03:00:00 12897     NA     NA     NA     NA     NA     NA
 6 PGAE  2022-05-28 04:00:00 13140     NA     NA     NA     NA     NA     NA
 7 PGAE  2022-05-28 05:00:00 13037     NA     NA     NA     NA     NA     NA
 8 PGAE  2022-05-28 06:00:00 12430     NA     NA     NA     NA     NA     NA
 9 PGAE  2022-05-28 07:00:00 11611     NA     NA     NA     NA     NA     NA
10 PGAE  2022-05-28 08:00:00 11075     NA     NA     NA     NA     NA     NA
# i 18,591 more rows
# i 6 more variables: lag_72 <dbl>, lag_8760 <dbl>, month <fct>, wday <fct>,
#   hour <fct>, trend <dbl>
# A tibble: 6 x 3
  subba period                  y
  <chr> <dttm>              <dbl>
1 SCE   2022-05-27 23:00:00 11944
2 SCE   2022-05-28 00:00:00 12253
3 SCE   2022-05-28 01:00:00 12504
4 SCE   2022-05-28 02:00:00 12249
5 SCE   2022-05-28 03:00:00 12234
6 SCE   2022-05-28 04:00:00 12406
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
# A tibble: 18,601 x 15
   subba period                  y lag_24 lag_25 lag_26 lag_27 lag_28 lag_48
   <chr> <dttm>              <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 SCE   2022-05-27 23:00:00 11944     NA     NA     NA     NA     NA     NA
 2 SCE   2022-05-28 00:00:00 12253     NA     NA     NA     NA     NA     NA
 3 SCE   2022-05-28 01:00:00 12504     NA     NA     NA     NA     NA     NA
 4 SCE   2022-05-28 02:00:00 12249     NA     NA     NA     NA     NA     NA
 5 SCE   2022-05-28 03:00:00 12234     NA     NA     NA     NA     NA     NA
 6 SCE   2022-05-28 04:00:00 12406     NA     NA     NA     NA     NA     NA
 7 SCE   2022-05-28 05:00:00 12093     NA     NA     NA     NA     NA     NA
 8 SCE   2022-05-28 06:00:00 11492     NA     NA     NA     NA     NA     NA
 9 SCE   2022-05-28 07:00:00 10732     NA     NA     NA     NA     NA     NA
10 SCE   2022-05-28 08:00:00 10187     NA     NA     NA     NA     NA     NA
# i 18,591 more rows
# i 6 more variables: lag_72 <dbl>, lag_8760 <dbl>, month <fct>, wday <fct>,
#   hour <fct>, trend <dbl>
# A tibble: 6 x 3
  subba period                  y
  <chr> <dttm>              <dbl>
1 SDGE  2022-05-27 23:00:00  1236
2 SDGE  2022-05-28 00:00:00  1502
3 SDGE  2022-05-28 01:00:00  1851
4 SDGE  2022-05-28 02:00:00  2145
5 SDGE  2022-05-28 03:00:00  2327
6 SDGE  2022-05-28 04:00:00  2401
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
# A tibble: 18,601 x 15
   subba period                  y lag_24 lag_25 lag_26 lag_27 lag_28 lag_48
   <chr> <dttm>              <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 SDGE  2022-05-27 23:00:00  1236     NA     NA     NA     NA     NA     NA
 2 SDGE  2022-05-28 00:00:00  1502     NA     NA     NA     NA     NA     NA
 3 SDGE  2022-05-28 01:00:00  1851     NA     NA     NA     NA     NA     NA
 4 SDGE  2022-05-28 02:00:00  2145     NA     NA     NA     NA     NA     NA
 5 SDGE  2022-05-28 03:00:00  2327     NA     NA     NA     NA     NA     NA
 6 SDGE  2022-05-28 04:00:00  2401     NA     NA     NA     NA     NA     NA
 7 SDGE  2022-05-28 05:00:00  2338     NA     NA     NA     NA     NA     NA
 8 SDGE  2022-05-28 06:00:00  2239     NA     NA     NA     NA     NA     NA
 9 SDGE  2022-05-28 07:00:00  2061     NA     NA     NA     NA     NA     NA
10 SDGE  2022-05-28 08:00:00  1991     NA     NA     NA     NA     NA     NA
# i 18,591 more rows
# i 6 more variables: lag_72 <dbl>, lag_8760 <dbl>, month <fct>, wday <fct>,
#   hour <fct>, trend <dbl>
# A tibble: 6 x 3
  subba period                  y
  <chr> <dttm>              <dbl>
1 VEA   2022-05-27 23:00:00    91
2 VEA   2022-05-28 00:00:00    90
3 VEA   2022-05-28 01:00:00    87
4 VEA   2022-05-28 02:00:00    83
5 VEA   2022-05-28 03:00:00    80
6 VEA   2022-05-28 04:00:00    76
Add lag: 24
Add lag: 25
Add lag: 26
Add lag: 27
Add lag: 28
Add lag: 48
Add lag: 72
Add lag: 8760
# A tibble: 18,601 x 15
   subba period                  y lag_24 lag_25 lag_26 lag_27 lag_28 lag_48
   <chr> <dttm>              <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
 1 VEA   2022-05-27 23:00:00    91     NA     NA     NA     NA     NA     NA
 2 VEA   2022-05-28 00:00:00    90     NA     NA     NA     NA     NA     NA
 3 VEA   2022-05-28 01:00:00    87     NA     NA     NA     NA     NA     NA
 4 VEA   2022-05-28 02:00:00    83     NA     NA     NA     NA     NA     NA
 5 VEA   2022-05-28 03:00:00    80     NA     NA     NA     NA     NA     NA
 6 VEA   2022-05-28 04:00:00    76     NA     NA     NA     NA     NA     NA
 7 VEA   2022-05-28 05:00:00    71     NA     NA     NA     NA     NA     NA
 8 VEA   2022-05-28 06:00:00    65     NA     NA     NA     NA     NA     NA
 9 VEA   2022-05-28 07:00:00    60     NA     NA     NA     NA     NA     NA
10 VEA   2022-05-28 08:00:00    55     NA     NA     NA     NA     NA     NA
# i 18,591 more rows
# i 6 more variables: lag_72 <dbl>, lag_8760 <dbl>, month <fct>, wday <fct>,
#   hour <fct>, trend <dbl>
# A tibble: 6 x 15
  subba period                  y lag_24 lag_25 lag_26 lag_27 lag_28 lag_48
  <chr> <dttm>              <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>  <dbl>
1 PGAE  2022-05-27 23:00:00 11545     NA     NA     NA     NA     NA     NA
2 PGAE  2022-05-28 00:00:00 12061     NA     NA     NA     NA     NA     NA
3 PGAE  2022-05-28 01:00:00 12347     NA     NA     NA     NA     NA     NA
4 PGAE  2022-05-28 02:00:00 12697     NA     NA     NA     NA     NA     NA
5 PGAE  2022-05-28 03:00:00 12897     NA     NA     NA     NA     NA     NA
6 PGAE  2022-05-28 04:00:00 13140     NA     NA     NA     NA     NA     NA
# i 6 more variables: lag_72 <dbl>, lag_8760 <dbl>, month <fct>, wday <fct>,
#   hour <fct>, trend <dbl>
# fc <- refresh_forecast(
#     input = data,
#     forecast_log_path = forecast_log_path,
#     forecast_path = forecast_path,
#     calibrated_models_path = calibrated_models_path,
#     h = h,
#     index = "period",
#     var = "value",
#     train_length = 24 * 31 * 25,
#     lags = lags,
#     init = FALSE,
#     save = TRUE,
#     seasonal = TRUE,
#     trend = TRUE
# )
# if (!is.null(fc)) {
#     head(fc)
#     plot_forecast(
#         input = data,
#         forecast = fc,
#         var = "value",
#         index = "period",
#         hours = 24 * 3
#     )
# }

Score the Forecast

# fc_log <- load_forecast_log(forecast_log_path = forecast_log_path)

# score_rows <- which(!fc_log$score)


# if (length(score_rows) == 0) {
#     message("All models were scored ")
# } else {
#     subba <- unique(fc_log$subba[score_rows])

#     fc <- load_forecast(forecast_path = forecast_path)

#     for (i in subba) {
#         print(i)
#         d <- data |> dplyr::filter(subba == i)

#         r <- which(fc_log$subba == i & !fc_log$score)

#         for (l in r) {
#             f <- fc |>
#                 dplyr::filter(forecast_label == fc_log$forecast_label[l], subba == i) |>
#                 dplyr::left_join(d |> dplyr::select(time = period, subba, value), by = c("time", "subba")) |>
#                 dplyr::filter(!is.na(value))


#             fc_log$mape[l] <- mean(abs(f$value - f$yhat) / f$value)

#             fc_log$rmse[l] <- (mean((f$value - f$yhat)^2))^0.5

#             fc_log$coverage[l] <- length(which(f$value <= f$upper & f$value >= f$lower)) / nrow(f)

#             if (nrow(f) == fc_log$h[l]) {
#                 fc_log$score[l] <- TRUE
#             }

#             write.csv(fc_log, forecast_log_path, row.names = FALSE)
#         }
#     }

#     gt::gt(fc_log[score_rows, ])
# }